Introduction to R
for Social Scientists

Workshop Day 2A | 2022-07-26
Jeffrey M. Girard | Pitt Methods

Program II

Missing Values

  • Sometimes your data will have missing values
    • Perhaps these were never collected
    • Perhaps the values were lost/corrupted
    • Perhaps the participant didn’t respond
  • We need to tell R which values are missing
    • To do so, we set those values to NA
    • Functions from tidyverse make this easy
  • Missingness is often “contagious” in R
    • e.g., the mean of a vector containing NA is itself NA

Missing Values Live Coding

# SETUP: We will need tidyverse for the read and mutate functions

library(tidyverse)

# ==============================================================================

# PITFALL: Number codes for missingness will mess up calculations in R
# R has no way to know that -999 is a placeholder, so it is treated as data

heights <- c(149, 158, -999) # here we use -999 to represent a missing value

range(heights) # the -999 code is reported as the minimum

mean(heights) # the -999 code is averaged in like any other number

log(heights) # our missing value is no longer -999

# ==============================================================================

# USECASE: Use NA for missingness instead

heights2 <- c(149, 158, NA)
heights2

log(heights2) # the NA stayed an NA (due to contagiousness)

# ==============================================================================

# LESSON: Use na.rm = TRUE to do a summary function ignoring the NAs

mean(heights2) # the mean is an NA (due to contagiousness)

mean(heights2, na.rm = TRUE) # mean of the two observed heights only

range(heights2, na.rm = TRUE) # min and max of the observed heights only

# ==============================================================================

# USECASE: Dealing with missing values in tibbles
# cereal.csv is read from the current working directory

cereal <- read_csv("cereal.csv")

cereal$rating

# NOTE(review): presumably this range includes -999, since the steps below
# treat -999 as the missing-value code in this file (confirm against the data)
range(cereal$rating)

# ==============================================================================

# LESSON: Use na_if() to convert specific values to NA while mutating
# Only the rating column is recoded here; other columns are left untouched

cereal2 <- mutate(cereal, rating = na_if(rating, -999))

cereal2$rating

range(cereal2$rating, na.rm = TRUE)

# ==============================================================================

# LESSON: Use read_csv(na) to convert specific values to NA while reading
# The na argument applies to every column in the file, not just rating
# NOTE: supplying na replaces readr's default of c("", "NA"); list those too
# (e.g., na = c("", "NA", "-999")) if the file also uses them for missingness

cereal3 <- read_csv("cereal.csv", na = "-999")

cereal3$rating

range(cereal3$rating, na.rm = TRUE)

Factors

  • Factors are used to represent categorical data
    • Factors have multiple possible levels
    • Levels are discrete and mutually-exclusive
  • Sometimes categories are unordered (nominal)
    • Action or Comedy or Drama
    • Asia or Europe or North America
  • Sometimes categories are ordered (ordinal)
    • Mild < Medium < Hot
    • XS < S < M < L < XL

Factors Live Coding

# USECASE: Ten kids each choose a meal, coded 1: nuggets, 2: pizza, 3: salad

food <- c(2, 2, 1, 2, 1, 2, 1, 1, 2, 2)
print(food)

# ==============================================================================

# LESSON: factor() turns the numeric codes into a categorical variable

# Declaring the levels keeps salad (3) as a category even though nobody chose it
food2 <- factor(x = food, levels = c(1, 2, 3))
print(food2)

# Adding labels (in the same order as the levels) shows names instead of codes
food3 <- factor(
  x = food,
  levels = c(1, 2, 3),
  labels = c("nuggets", "pizza", "salad")
)
print(food3)

# ==============================================================================

# USECASE: table() counts the observations at each level, including empty ones

print(table(food3))

# ==============================================================================

# PITFALL: Don't confuse levels and labels
# levels = the values as they appear in the data (here the numbers 1, 2, 3)
# labels = the names to display for those levels
# The call below deliberately swaps them to show what goes wrong

food4 <- factor(food, labels = c(1, 2, 3),
                levels = c("nuggets", "pizza", "salad"))
food4 # full of <NA> because it can't find these levels

# ==============================================================================

# USECASE: Character vectors can be made into factors directly (as self-labels)

genre <- c("pop", "metal", "pop", "rock", "rap", "rap", "pop", "rock")
print(genre)

# With no levels given, the observed values become the levels in sorted order
genre2 <- factor(x = genre)
print(genre2)

print(table(genre2))

# ==============================================================================

# LESSON: For ordinal data, list the levels low-to-high and set ordered = TRUE

salsa <- c("hot", "mild", "medium", "mild", "medium", "medium")

salsa2 <- factor(
  x = salsa,
  levels = c("mild", "medium", "hot"),
  ordered = TRUE
)
print(salsa2)

# NOTE: We may want to visualize or model ordinal factors differently

# ==============================================================================

# USECASE: Working with factors in a tibble
# cereal.csv is read from the current working directory

cereal <- read_csv("cereal.csv")
cereal

# Overwrite mfr and type with factor versions of themselves
cereal2 <- mutate(cereal, mfr = factor(mfr), type = factor(type))
cereal2

# Count how many rows fall into each level of the new factors
table(cereal2$mfr)

table(cereal2$type)

String Manipulation

  • It can be helpful to transform strings
    • Sometimes your data is text-based
    • Text also appears in tables and figures
    • So we may want to customize it quickly
  • Tidyverse contains tools for this

String Manipulation Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Re-capitalize strings

x <- c("R4SS: Introduction to R for Social Scientists")

str_to_lower(x) # all lowercase

str_to_upper(x) # all uppercase

str_to_sentence(x) # sentence case

str_to_title(x) # title case

# ==============================================================================

# USECASE: Extract subsets of strings
# str_sub() is applied to each element of the vector separately

x <- c("Apple", "Banana", "Pear")

str_sub(x, start = 1, end = 3) # the first three characters

str_sub(x, start = -3, end = -1) # negative positions count from the end

str_sub(x, start = 2, end = -2) # second through second-to-last character

str_sub(x, start = 1, end = 5) # can go beyond the end

# ==============================================================================

# USECASE: Remove whitespace from strings

x <- "  Sometimes strings have   too   much white space "
x

str_trim(x) # remove white space at the start and end

str_squish(x) # trim and then collapse inner white space

# ==============================================================================

# USECASE: Remove and replace patterns in strings
# NOTE: this string literal spans two lines, so the value of x contains a
# newline followed by the leading spaces of the second line

x <- "Scientists very often utilize very fancy words, 
      even when they could utilize simpler ones."

str_remove(x, pattern = "very ") # removes first pattern match only

str_remove_all(x, pattern = "very ") # removes all pattern matches

str_replace(x, pattern = "utilize", replacement = "use") # first match only

str_replace_all(x, pattern = "utilize", replacement = "use") # every match

# NOTE: More complex patterns can be found using regular expressions (regex)

# ==============================================================================

# USECASE: Create a string manipulation pipeline
# Each step receives the output of the previous one as its first argument;
# the final print() displays the result that is stored in x_clean

x_clean <- 
  x |> 
  str_remove_all("very ") |> 
  str_replace_all("utilize", "use") |> 
  print()

If Else

  • A locked door behaves conditionally
    • If you have the key, then open up…
    • Otherwise, stay closed…
  • Sometimes we want code to behave conditionally
    • Filter retains observations conditionally
      (e.g., if it meets a condition, it gets to stay)
    • Let’s learn to transform variables conditionally
    • We can use if_else() for simple examples

If Else Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Determining whether someone can vote in the US
# if_else() returns `true` where the condition holds and `false` where it fails

age <- 12

age_group <- 
  if_else(
    condition = age >= 18,
    true = "adult",
    false = "child"
  ) |> 
  print()

# ==============================================================================

# TIP: Because argument names are optional, we can shorten this (if we want)
# The arguments are then matched by position: condition, true, false

age_group <- if_else(age >= 18, "adult", "child") |> print()

# ==============================================================================

# LESSON: This function is particularly useful applied to vectors
# The condition is checked for each element, giving one result per element

ages <- c(13, 18, 14, 19, 22, 16)

age_groups <- if_else(ages >= 18, "adult", "child") |> print()

# ==============================================================================

# USECASE: We can therefore use it during data wrangling
# -999 is read in as NA here, so those rows have a missing condition;
# if_else() returns NA wherever its condition is NA

cereal <- read_csv("cereal.csv", na = "-999")

cereal2 <- mutate(cereal, popular = if_else(rating > 50, "yes", "no"))
cereal2

# Same idea with named arguments, written as a pipeline
cereal3 <- 
  cereal |>
  mutate(
    diabetes = if_else(
      condition = sugars == 0, 
      true = "sugar-free", 
      false = "contains sugar"
    )
  ) |> 
  print()

Case When

  • An elevator also behaves conditionally
    • If you press a button, then it goes to that floor
    • There are usually more than just two buttons
    • In this analogy (but not in real life), the elevator only responds to the first button pressed
  • Sometimes we want code to behave this way
    • case_when() expands upon if_else()
    • It can have multiple conditions (floor buttons)
    • The first condition met “wins” (picks the floor)

Case When Live Coding

# SETUP: We will need tidyverse for almost all of these functions

library(tidyverse)

# ==============================================================================

# USECASE: Determine what types of movies your kids can watch
# Conditions are checked top to bottom; each element gets the value of the
# first condition it satisfies

ages <- c(11, 13, 18)

movies_allowed <- 
  case_when(
    ages >= 17 ~ "R",
    ages >= 13 ~ "PG-13",
    ages < 13 ~ "PG"
  ) |> 
  print()

# ==============================================================================

# PITFALL: Don't put the least restrictive condition first
# Here age >= 13 is listed before age >= 17, so the "R" row is never reached

age <- 18

movies_allowed2 <- 
  case_when(
    age < 13 ~ "PG",
    age >= 13 ~ "PG-13",
    age >= 17 ~ "R"
  ) |> 
  print() # age >= 13, so PG-13 wins before checking if age >= 17

# ==============================================================================

# USECASE: Use case_when to re-code variables during data wrangling
# NOTE: comparisons with NA give NA, so a row whose species is NA matches none
# of these conditions and its species3 stays NA

starwars

sw <- 
  starwars |> 
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      species != "Human" & species != "Droid" ~ "Alien"
    )
  ) |> 
  select(name, species3) |> 
  print()

# ==============================================================================

# TIP: case_when() has a .default argument (added in dplyr 1.1.0)
# This is where the elevator will drop you off if you hit no buttons
# NOTE: .default is used whenever no condition is TRUE, which includes rows
# whose species is NA (their comparisons are NA rather than TRUE)

sw <- 
  starwars |> 
  mutate(
    species3 = case_when(
      species == "Human" ~ "Human",
      species == "Droid" ~ "Droid",
      .default = "Alien"
    )
  ) |> 
  select(name, species3) |> 
  print()

# NOTE: The above code requires dplyr 1.1.0 or later
# On older versions, use TRUE ~ "Alien" as the last condition instead

Wrangle III

Summarize

  • Although we store data about many observations…
  • …we often want to summarize across observations
    • This is like folding the tibble down to one row
  • We’ve seen functions that summarize vectors
    • length(), sum(), min(), max()
    • mean(), median(), sd(), var()
  • summarize() lets us use them on tibbles
    • It works very similarly to mutate()
    • It always creates a tibble as output

Summarize Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Example data: one row per sale (which customer, which store, how much)
sales <- 
  tibble(
    customer = c(1, 2, 3, 1, 3),
    store = c("A", "A", "A", "B", "B"),
    items = c(25, 20, 16, 10, 5),
    spent = c(685, 590, 392, 185, 123)
  ) |> 
  print()

# ==============================================================================

# USECASE: Summarize the typical sales
# Each name = expression pair becomes one column of the summary tibble

my_summary <- 
  sales |> 
  summarize(
    avg_items = mean(items),
    avg_spent = mean(spent)
  ) |> 
  print()

# ==============================================================================

# PITFALL: Don't use summary() instead of summarize()
# summary() is a different (base R) function with a similar name
# NOTE(review): presumably it does not build avg_items/avg_spent columns the
# way summarize() does; run it to confirm the exact output

my_summary <- 
  sales |> 
  summary(
    avg_items = mean(items),
    avg_spent = mean(spent)
  ) |> 
  print()

# ==============================================================================

# USECASE: Use more than one summary function

my_summary <- 
  sales |> 
  summarize(
    total_items = sum(items),
    total_spent = sum(spent),
    avg_items = mean(items),
    avg_spent = mean(spent)
  ) |> 
  print()

# ==============================================================================

# USECASE: Use counting functions
# n() counts rows; n_distinct() counts the unique values of a variable

my_counts <- 
  sales |> 
  summarize(
    n_sales = n(),
    n_customers = n_distinct(customer),
    n_stores = n_distinct(store)
  ) |> 
  print()

Group Summarize

  • We can also summarize a tibble by group
    • This is like folding the tibble multiple times
    • Specifically, we fold down to one row per group
  • The syntax for summarize is identical
    • The only difference is in the tibble we give it
    • We first pass it through group_by()
    • Pipelines make this very easy

Group Summarize Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Example data: one row per sale (which customer, which store, how much)
sales <- 
  tibble(
    customer = c(1, 2, 3, 1, 3),
    store = c("A", "A", "A", "B", "B"),
    items = c(25, 20, 16, 10, 5),
    spent = c(685, 590, 392, 185, 123)
  ) |> 
  print()

# ==============================================================================

# LESSON: We pass a tibble through group_by to group it
# Grouping does not change the data itself, only how later verbs treat it

sales

sales |> group_by(store) # note the display says "grouped"

# ==============================================================================

# USECASE: We can then summarize and get stats per group
# The result has one row per store instead of one row overall

sales |> 
  group_by(store) |> 
  summarize(
    customers = n_distinct(customer),
    items_sold = sum(items),
    total_sales = sum(spent),
    avg_items = mean(items),
    avg_spent = mean(spent)
  )

# ==============================================================================

# SETUP: Let's get a larger, more realistic dataset

# Extra pane > Packages tab > Install > nycflights13
# (the package must be installed once before library() can load it)

library("nycflights13")

flights

# ==============================================================================

# USECASE: Find the carrier with the lowest average delays
# na.rm = TRUE so that missing delays do not turn a carrier's mean into NA;
# arrange() sorts ascending, so the lowest average delay is in the first row

flights |> 
  group_by(carrier) |> 
  summarize(m_delay = mean(dep_delay, na.rm = TRUE)) |> 
  arrange(m_delay)

# ==============================================================================

# LESSON: We can also group by multiple variables

# USECASE: Let's find the day of the year with the most flights
# Grouping by month and day gives one group per calendar date;
# desc() sorts from most to fewest flights

flights |> 
  group_by(month, day) |> 
  summarize(n_flights = n()) |> 
  arrange(desc(n_flights))

# ==============================================================================

# PITFALL: Note how this differs from grouping by just day
# Grouping by day alone pools the same day-of-month across all months

flights |> 
  group_by(day)|> 
  summarize(n_flights = n()) |> 
  arrange(desc(n_flights))

Across

  • We can use across() to repeat an operation across multiple variables in a tibble
    • This makes our code shorter
    • It is faster to read and write
    • It is also less error-prone
  • So we can repeat a function in order to…
    • mutate() multiple variables
    • summarize() multiple variables

Across Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

starwars

# ==============================================================================

# USECASE: Applying the same mutation to multiple variables is a pain
# Both versions below convert the same three columns to factors

sw <- 
  starwars |> 
  mutate(
    hair_color = factor(hair_color),
    skin_color = factor(skin_color),
    eye_color = factor(eye_color)
  ) |> 
  print() # before

# .cols says which columns to transform; .fns says which function to apply
sw <- 
  starwars |> 
  mutate(
    across(
      .cols = c(hair_color, skin_color, eye_color), 
      .fns = factor
    )
  ) |> 
  print() #after

# ==============================================================================

# PITFALL: Don't forget to wrap the .cols part in c()
# Without c(), only mass is given to .cols and birth_year is passed to
# across() as a separate, unnamed argument

sw <- 
  starwars |> 
  mutate(
    across(
      .cols = mass, birth_year, 
      .fns = round,
      digits = 1
    )
  ) |> 
  print() # error

# ==============================================================================

# LESSON: To pass arguments to the inner function, wrap it in a small
# anonymous function: \(x) round(x, digits = 1)
# NOTE: Adding the arguments directly inside across() (e.g., digits = 1) is
# deprecated as of dplyr 1.1.0, so the anonymous function form is preferred

sw <- 
  starwars |> 
  mutate(
    across(
      .cols = c(mass, birth_year), 
      .fns = \(x) round(x, digits = 1)
    )
  ) |> 
  print()

# ==============================================================================

# USECASE: You can also apply the same summary functions across variables
# Both versions below compute the mean of the same three columns, ignoring NAs

sw <- 
  starwars |> 
  summarize(
    height = mean(height, na.rm = TRUE),
    mass = mean(mass, na.rm = TRUE),
    birth_year = mean(birth_year, na.rm = TRUE)
  ) |> 
  print()

# na.rm = TRUE is supplied through an anonymous function because passing extra
# arguments directly inside across() is deprecated as of dplyr 1.1.0
sw <- 
  starwars |> 
  summarize(
    across(
      .cols = c(height, mass, birth_year), 
      .fns = \(x) mean(x, na.rm = TRUE)
    )
  ) |> 
  print()

Separate and Unite

  • Tidy data needs one value per cell
  • So we may need to separate cells
    • e.g., What was the model of my first car?
    • "Nissan Altima 2003"
    • "Nissan" "Altima" "2003"
  • But some tasks require us to unite cells
    • e.g., What address should I mail to?
    • 123 "Main Street"
    • "123 Main Street"

Separate Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Create some example data
# id packs three pieces joined by "_"; duration packs two pieces joined by ":"
dat <- 
  tibble(
    id = c("A_001_01", "A_002_01", "B_001_01", "B_002_01", "C_001_01", "C_002_01"),
    duration = c("01:16", "01:21", "01:49", "00:34", "00:32", "00:54")
  ) |> 
  print()

# ==============================================================================

# USECASE: Separate a column into multiple columns
# col = the column to split, into = the new column names, sep = the delimiter

dat2 <- 
  dat |> 
  separate(
    col = duration, 
    into = c("min", "sec"), 
    sep = ":"
  ) |> 
  print()

# ==============================================================================

# USECASE: This also works with more than two "into" columns

dat2 <- 
  dat |>  
  separate(
    col = id, 
    into = c("group", "subject", "time"), 
    sep = "_"
  ) |> 
  print()

# ==============================================================================

# TIP: To automatically convert strings into numbers, use convert
# Without it, the new columns stay as character strings

dat2 <- 
  dat |> 
  separate(
    col = id, 
    into = c("group", "subject", "time"), 
    sep = "_", 
    convert = TRUE
  ) |> 
  print()

# ==============================================================================

# PITFALL: Don't forget to close the "into" vector's parentheses
# Here sep = ":" sits inside c(), so it becomes a third element of `into`
# instead of being the sep argument of separate()

dat2 <- 
  dat|> 
  separate(col = duration, into = c("min", "sec", sep = ":")) #error

Unite Live Coding

# SETUP: We will need tidyverse and an example dataset

library(tidyverse)

# Re-create the example data so this script can be run on its own
dat <- 
  tibble(
    id = c("A_001_01", "A_002_01", "B_001_01", "B_002_01", "C_001_01", "C_002_01"),
    duration = c("01:16", "01:21", "01:49", "00:34", "00:32", "00:54")
  )

# Split id on its "_" delimiter and duration on its ":" delimiter, giving the
# five separate columns (group, subject, time, min, sec) used in the lessons
dat2 <- 
  dat |> 
  separate(col = id, into = c("group", "subject", "time"), sep = "_") |> 
  separate(col = duration, into = c("min", "sec"), sep = ":") |> 
  print()

# ==============================================================================

# USECASE: Unite multiple columns into one string
# col = name of the new column, followed by the columns to paste together;
# sep = the text placed between the pieces

dat3 <- 
  dat2 |> 
  unite(col = "newid", group, subject, time, sep = "-") |> 
  unite(col = "duration", min, sec, sep = ":") |> 
  print()

# ==============================================================================

# LESSON: Retain the columns being united with remove = FALSE
# group:time selects every column from group through time;
# sep = "" pastes the pieces together with nothing in between

dat3 <- 
  dat2 |> 
  unite(col = "newid", group:time, sep = "", remove = FALSE) |> 
  print()

Pivot Longer and Wider

  • Both long and wide formats can be tidy
    • Long formats are better for MLM
    • Wide formats are better for SEM
  • It can be useful to quickly reshape a tibble
    • pivot_longer(): wide → long
    • pivot_wider(): long → wide

Pivot Longer Live Coding

# SETUP: We will need tidyverse and an example dataset (from workshop website)

library(tidyverse)

# gradebook.csv is read from the current working directory
gradebook <- 
  read_csv("gradebook.csv") |> 
  print()

# ==============================================================================

# USECASE: We can pivot to long format by creating name and value variables
# cols = the columns to stack; their names go into the "test" column and
# their values go into the "grade" column

gradebook2 <- 
  gradebook |> 
  pivot_longer(
    cols = c(test1, test2, test3, test4, test5), 
    names_to = "test", 
    values_to = "grade"
  ) |> 
  print()

# ==============================================================================

# TIP: Use selection helpers to select columns quickly
# test1:test5 selects every column from test1 through test5

gradebook2 <- 
  gradebook |> 
  pivot_longer(
    cols = test1:test5, 
    names_to = "test", 
    values_to = "grade"
  ) |> 
  print()

# ==============================================================================

# LESSON: Automatically remove the name prefix
# names_prefix strips "test" from the start of each name before storing it

gradebook2 <- 
  gradebook|> 
  pivot_longer(
    cols = starts_with("test"), 
    names_to = "test", 
    values_to = "grade",
    names_prefix = "test"
  ) |> 
  print()

Pivot Wider Live Coding

# SETUP: We will need tidyverse and an example dataset (from workshop website)

library(tidyverse)

# diary.csv is read from the current working directory
diary <- read_csv("diary.csv") |> print()

# ==============================================================================

# USECASE: Reshape this long format to a wider format
# names_from = the column whose values become the new column names;
# values_from = the column whose values fill those new columns

# One column per scale
diary_scale <- 
  diary |> 
  pivot_wider(
    names_from = "scale",
    values_from = "score"
  ) |> 
  print()

# One column per day
diary_day <- 
  diary |>  
  pivot_wider(
    names_from = "day",
    values_from = "score"
  ) |> 
  print()

# NOTE: There are thus multiple possible wide formats (for different uses)

# ==============================================================================

# LESSON: We can add a prefix to each name to avoid numeric names

# Look again at the wide-by-day tibble made earlier: its new columns are named
# after the values of day
diary_day

# names_prefix puts "day_" in front of every new column name
diary_day <- 
  diary |> 
  pivot_wider(
    names_from = "day",
    values_from = "score",
    names_prefix = "day_"
  ) |> 
  print()

# ==============================================================================

# LESSON: We can also pivot on multiple columns at once
# Each combination of scale and day gets its own column

diary_double <- 
  diary |> 
  pivot_wider(
    names_from = c("scale", "day"),
    values_from = "score"
  ) |> 
  print()

Practice III